import pandas as pd # import pandas library
import numpy as np # import numpy library
import matplotlib.pyplot as plt # import matplotlib library
from tqdm import tqdm # import tqdm library for loading bars
import seaborn as sns # import seaborn library
np.random.seed(1727546) # fix the global NumPy RNG seed (CID number) so the shuffles below are reproducible
sns.set(rc={'figure.figsize':(20,15)}) # set seaborn figure size
plt.rcParams['figure.dpi']= 300 # set matplotlib figure dpi
plt.rcParams['figure.figsize'] = 8, 6 # NOTE(review): overrides the 20x15 figsize set via seaborn above — confirm which is intended
We first start by loading the data and performing some elementary data analysis:
airfoil_noise_train = pd.read_csv("data/airfoil_noise_samples.csv") # read in airfoil noise training data
airfoil_columns = airfoil_noise_train.columns.to_numpy()[:-1] # feature column names (every column except the last, Sound Pressure)
airfoil_noise_train.head() # display first 5 rows of airfoil noise training data
| Frequency | Angle | Displacement | Chord length | Velocity | Thickness | Sound Pressure | |
|---|---|---|---|---|---|---|---|
| 0 | 2175.611424 | 15.138865 | 21.075119 | 0.088194 | 66.764401 | 0.044771 | 122.365215 |
| 1 | 2962.923620 | 13.400893 | 13.200889 | 0.048462 | 78.221903 | 0.011041 | 129.296236 |
| 2 | 4430.810843 | 2.164599 | 13.959536 | 0.226743 | 57.053201 | 0.011499 | 121.827380 |
| 3 | 4939.695645 | 13.857682 | 18.203793 | 0.021705 | 23.896377 | 0.021475 | 114.998132 |
| 4 | 2193.979785 | 9.298757 | 11.007713 | 0.052737 | 38.917034 | 0.001741 | 125.639641 |
We also load the test data for later use.
airfoil_noise_test = pd.read_csv("data/airfoil_noise_test.csv") # read in airfoil noise test data (held out until evaluation)
airfoil_noise_test.head() # display first 5 rows of airfoil noise test data
| Frequency | Angle | Displacement | Chord length | Velocity | Thickness | Sound Pressure | |
|---|---|---|---|---|---|---|---|
| 0 | 1143.654418 | 12.783087 | 15.277127 | 0.110389 | 46.089548 | 0.024076 | 125.332111 |
| 1 | 619.208992 | 4.460285 | 16.198696 | 0.284357 | 36.728360 | 0.004000 | 122.801183 |
| 2 | 646.114737 | 2.521952 | 7.533605 | 0.072292 | 59.498237 | 0.003590 | 129.361188 |
| 3 | 1246.777461 | 8.438129 | 12.396435 | 0.034411 | 47.557277 | 0.002407 | 125.656263 |
| 4 | 286.200927 | 12.238350 | 20.877161 | 0.082437 | 72.786590 | 0.036883 | 124.241736 |
airfoil_noise_test.describe() # display summary statistics of airfoil noise test data
| Frequency | Angle | Displacement | Chord length | Velocity | Thickness | Sound Pressure | |
|---|---|---|---|---|---|---|---|
| count | 973.000000 | 973.000000 | 973.000000 | 973.000000 | 973.000000 | 973.000000 | 973.000000 |
| mean | 2899.010272 | 8.168879 | 13.457254 | 0.137074 | 50.775639 | 0.014163 | 123.968845 |
| std | 2678.943633 | 5.606230 | 4.551486 | 0.093508 | 15.940928 | 0.014034 | 4.654713 |
| min | 25.769177 | 0.011299 | 0.082166 | 0.000848 | 21.139451 | 0.000002 | 107.996004 |
| 25% | 1076.460339 | 3.618578 | 10.562385 | 0.054891 | 36.987297 | 0.004148 | 121.106574 |
| 50% | 2005.178515 | 7.249896 | 13.971662 | 0.122195 | 47.345385 | 0.008715 | 124.126098 |
| 75% | 3842.996299 | 11.799025 | 16.506753 | 0.208549 | 66.755918 | 0.019872 | 127.207364 |
| max | 19744.221140 | 26.623922 | 26.393545 | 0.360646 | 84.766450 | 0.068163 | 136.422496 |
Lastly in our EDA, we produce a pairplot of the features:
sns.pairplot(airfoil_noise_train, kind='reg', corner=True, plot_kws = {'scatter_kws': {'alpha': 0.1}}) # lower-triangle pairplot with per-pair regression fits; low alpha reduces overplotting
plt.suptitle("Pairplot of Airfoil Noise Training Data", y=1.02, fontsize=20) # set title of pairplot
plt.show() # render the figure
Throughout, let $\mathbf{x}_i\in\mathbb{R}^D$ for $i=1,\dots, N$ be the predictor data column vectors, i.e. the vectors containing all of the data for a given observation except the Sound Pressure. Moreover, let $\beta_i$ for $i=1,\dots,6$ be the $i$ th predictor variable, where the predictor variables are in the same order they appear in the columns, and let $\beta_7$ be the Sound Pressure.
We now split the data into our predictor matrix $\mathbf{X}\in\mathbb{R}^{N\times D}$, defined as:
$$ \mathbf{X} = \begin{pmatrix} \mathbf{x}_1^\top\\ \mathbf{x}_2^\top\\ \vdots\\ \mathbf{x}_N^\top \end{pmatrix} $$and predicted variable $\mathbf{y}$. In this case, $\mathbf{(y)}_i$ is the observed sound pressure for the $i$ th observation.
We split our train and test data into the predictor variable matrix, $\mathbf{X}$, and our predicted variable, $\mathbf{y}$.
The first step is to standardise our predictor variable matrix $\mathbf{X}$; this is necessary to be able to compare the parameters we are optimising to each other, and to reduce $\mathbf{X}$'s condition number, as in its unstandardised state it is $\approx 400,000$, which is very high and will make gradient descent difficult.
# drop target column and convert the feature columns to a numpy array
X_train_unstd = airfoil_noise_train.drop("Sound Pressure", axis=1).to_numpy()
# target variable as a numpy array
y_train = airfoil_noise_train["Sound Pressure"].to_numpy()
# same feature/target split for the test set
X_test_unstd = airfoil_noise_test.drop("Sound Pressure", axis=1).to_numpy()
# target variable as a numpy array
y_test = airfoil_noise_test["Sound Pressure"].to_numpy()
# the condition number quantifies how ill-conditioned X is for least squares / gradient descent
print(f"Condition number of unstandardised matrix X: {round(np.linalg.cond(X_train_unstd))}")
Condition number of unstandardised matrix X: 411342
We now proceed to standardise our predictor variable matrix, and we change the dimensions of our predicted variable vector $\mathbf{y}$ so numpy interprets it as a column vector as opposed to a row vector. Additionally, we shuffle our data to exclude any possible ordering the data may be in:
# Standardise using the TRAINING mean/std only, so no information from the test
# set leaks into the preprocessing. The statistics were previously recomputed
# four times inline; they are hoisted into named variables here.
train_mean = np.mean(X_train_unstd, axis=0) # per-feature training mean
train_std = np.std(X_train_unstd, axis=0) # per-feature training standard deviation
X_train = (X_train_unstd - train_mean) / train_std # standardise X_train
X_test = (X_test_unstd - train_mean) / train_std # standardise X_test with the TRAINING statistics
N, D = np.shape(X_train) # number of samples and number of features
y_train = y_train.reshape(N, 1) # make y_train an explicit column vector
y_test = y_test.reshape(np.shape(X_test)[0], 1) # make y_test an explicit column vector
# shuffle training data to remove any ordering present in the file
train_indices = np.arange(N) # create array of indices
np.random.shuffle(train_indices) # shuffle indices in place
X_train = X_train[train_indices] # shuffle X_train
y_train = y_train[train_indices] # shuffle y_train with the SAME permutation
# shuffle test data the same way
test_indices = np.arange(np.shape(X_test)[0]) # create array of indices
np.random.shuffle(test_indices) # shuffle indices in place
X_test = X_test[test_indices] # shuffle X_test
y_test = y_test[test_indices] # shuffle y_test with the SAME permutation
We display the condition number of the standardised $\mathbf{X}$ matrix to see how it changes after standardising:
print(f"Condition number of standardised matrix X: {round(np.linalg.cond(X_train))}") # print condition number of standardised matrix X
Condition number of standardised matrix X: 5
We can see it has drastically reduced to $5$, making gradient descent converge faster and thus usable.
We now turn our attention to the loss function:
$$ L(\beta) = \frac{1}{2N}\|\mathbf{y} - \mathbf{X}\beta-\mathbf{\beta_0}\|^2 $$Where $\beta\in\mathbb{R}^D,\mathbf{\beta_0}\in\mathbb{R}^N$. Note that $\mathbf{\beta_0} = \beta_0 \mathbf{z}_N$, where $\mathbf{z}_N := \begin{pmatrix}1\\1\\\vdots\\1\end{pmatrix}\in\mathbb{R}^N$
Consider one coordinate of $\mathbf{y},\, y_i$. Then we wish to minimise: $$ y_i = \mathbf{x}_i^\top\beta + \beta_0 = \begin{pmatrix}1 & \mathbf{x_i}^\top\end{pmatrix} \begin{pmatrix}\beta_0\\ \beta\end{pmatrix} $$
Therefore we can rewrite our loss function $L(\beta)$ as:
$$ L(\beta) = \frac{1}{2N}\|\mathbf{y} - \mathbf{X}_{aug}\beta_{aug}\|^2 $$Where $\mathbf{X}_{aug} = \begin{bmatrix}\mathbf{z}_N&\mathbf{X}\end{bmatrix}\in\mathbb{R}^{N\times (D+1)}$ and $\beta_{aug} = \begin{bmatrix}\beta_0\\\beta\end{bmatrix}\in \mathbb{R}^{D+1}$.
Due to the notes, we know that the optimal solution for this problem is:
$$ \beta_{aug}^* = \mathbf{X}_{aug}^\dag \mathbf{y} $$Where $A^\dag := (A^\top A)^{-1}A^\top$ is the Moore-Penrose pseudoinverse of $A$.
As a first step, we now implement a function to augment our matrix $\mathbf{X}$ into $\mathbf{X}_{aug}$ as above:
def augment(X):
    """
    Augment X with an intercept column.

    Prepends a column of ones to X so that the intercept beta_0 can be folded
    into the feature vector: X_aug @ [beta_0; beta] = X @ beta + beta_0.

    Args:
        X: (np.array) NxD matrix to augment
    Returns:
        X_aug: (np.array) Nx(D+1) augmented matrix whose first column is all ones
    """
    n_samples, _ = np.shape(X) # number of rows of X
    X_aug = np.hstack((np.ones((n_samples, 1)), X)) # prepend the column of ones
    return X_aug
We now implement a function to compute the optimal value of $\beta_{aug}$ using the Moore-Penrose pseudoinverse of $\mathbf{X}_a$ given the predictor matrix $\mathbf{X}$ and the predicted variable vector $\mathbf{y}$:
def compute_beta(X, y):
    """
    Compute the least squares optimal beta_aug given X and y.

    Uses the Moore-Penrose pseudoinverse: beta_aug* = pinv(X_aug) @ y.

    Args:
        X: (np.array) NxD predictor matrix
        y: (np.array) Nx1 predicted variable vector
    Returns:
        optimal_beta: (np.array) (D+1)x1 augmented least squares optimal feature vector
    """
    n_samples = X.shape[0]
    # prepend the intercept column of ones (inlined augmentation)
    X_aug = np.hstack((np.ones((n_samples, 1)), X))
    return np.linalg.pinv(X_aug) @ y
beta_star = compute_beta(X_train, y_train) # compute least squares optimal beta
To compute the average $MSE$ errors and the $R^2$ score for the dataset, we first implement functions to do exactly this for the linear model:
def compute_mse(X, y, beta):
    """
    Compute the mean squared error of the linear model.

    Args:
        X: (np.array) NxD predictor matrix
        y: (np.array) Nx1 predicted variable vector
        beta: (np.array) (D+1)x1 augmented feature vector
    Returns:
        mse: (float) mean squared error
    """
    # prepend the intercept column of ones (inlined augmentation)
    intercept_col = np.ones((X.shape[0], 1))
    residuals = np.hstack((intercept_col, X)) @ beta - y
    return np.mean(residuals ** 2)
def compute_R2(X, y, beta):
    """
    Compute the coefficient of determination (R^2) of the linear model.

    R^2 = 1 - SSE/SST, where SSE is the sum of squared residuals and SST the
    total sum of squares about the mean of y.

    Args:
        X: (np.array) NxD predictor matrix
        y: (np.array) Nx1 predicted variable vector
        beta: (np.array) (D+1)x1 augmented feature vector
    Returns:
        R2: (float) coefficient of determination
    """
    # prepend the intercept column of ones (inlined augmentation)
    intercept_col = np.ones((X.shape[0], 1))
    y_pred = np.hstack((intercept_col, X)) @ beta
    sse = np.sum((y - y_pred) ** 2) # residual sum of squares
    sst = np.sum((y - np.mean(y)) ** 2) # total sum of squares
    return 1 - (sse / sst)
We now display the standardised $\mathbf{\beta}^*_{aug}$, as this will allow us to compare the features with each other:
augmented_columns = np.concatenate([np.array(["Intercept"]), airfoil_columns]) # column names with "Intercept" prepended to match beta_aug's layout
pd.DataFrame(beta_star, index=augmented_columns, columns=["LS Beta"]).transpose() # display the least squares optimal beta on standardised features
| Intercept | Frequency | Angle | Displacement | Chord length | Velocity | Thickness | |
|---|---|---|---|---|---|---|---|
| LS Beta | 123.970283 | -3.575497 | 0.834478 | -3.928647 | -0.078334 | 1.724957 | -0.013563 |
Lastly, the in-sample $MSE$ and $R^2$ scores are displayed below:
print("In sample MSE: " + str(compute_mse(X_train, y_train, beta_star))) # in sample (training set) MSE
print("In sample R2: " + str(compute_R2(X_train, y_train, beta_star))) # in sample (training set) R2
In sample MSE: 1.8755566396402015 In sample R2: 0.9177343977263077
We now compute the out-of-sample $MSE$ and $R^2$ errors:
print("Out of sample MSE: " + str(compute_mse(X_test, y_test, beta_star))) # out of sample (test set) MSE
print("Out of sample R2: " + str(compute_R2(X_test, y_test, beta_star))) # out of sample (test set) R2
Out of sample MSE: 1.9843895771191569 Out of sample R2: 0.9083172459581623
We see that the model performs slightly worse outside of the training set, as expected, as it is near impossible to obtain a perfect model with $0$ generalisation error. The $MSE$ increases by $\approx 0.11$, and the $R^2$ decreases by $\approx 0.01$; the latter is more informative as it is scale invariant and thus better quantifies the decrease in performance when the model is applied outside the training set.
Moreover, we can conclude the variables don't follow an exactly linear relation, as if they did the generalisation error would be closer to $0$. However, they are almost linearly related (the linear approximation of the true function $f(\beta_i) = \beta_7$ accurately approximates the true function), as the out of sample $R^2$ score is relatively high (close to $1$).
We are tasked to minimise: $$ L_{LASSO}(\beta_0,\beta) = \frac{1}{2N}\|\mathbf{y}-\mathbf{X}\beta - \beta_0\|^2 + \lambda\|\beta\|_1 $$
Augmenting the system gives: $$ L_{LASSO}(\beta_{aug}) = \frac{1}{2N}\|\mathbf{y}-\mathbf{X}_{aug}\beta_{aug}\|^2 + \lambda\left\|\begin{bmatrix} 0 \\ \beta\end{bmatrix}\right\|_1 $$
The gradient w.r.t. $\beta_{aug}$ of the loss then is:
$$ \nabla L_{LASSO}(\beta_{aug}) = \frac{1}{N}(\mathbf{X}_{aug}^\top(\mathbf{X}_{aug}\beta_{aug} - \mathbf{y})) + \lambda\, sgn.\left(\begin{bmatrix} 0 \\ \beta\end{bmatrix}\right) $$Where the sign function $sgn(x)$ is defined as:
$$ sgn(x) = \begin{cases} 1,\, x > 0\\ 0,\, x = 0\\ -1,\, x < 0\\ \end{cases} $$and the "$.$" after $sgn$ indicates the function is applied entrywise. This is implemented below:
def lasso_loss(beta_aug, X, y, lam):
    """
    Compute the LASSO loss: (1/2N)||X_aug beta_aug - y||^2 + lam*||beta||_1.

    The intercept beta_aug[0] is excluded from the L1 penalty.

    Bug fix: the data term previously divided by the module-level global N
    (the full training-set size) instead of the number of rows of the X that
    was passed in, which mis-scaled the loss whenever X was a cross-validation
    fold of a different size.

    Args:
        beta_aug: (np.array) (D+1)x1 augmented feature vector
        X: (np.array) NxD predictor matrix
        y: (np.array) Nx1 predicted variable vector
        lam: (float) lasso penalty parameter
    Returns:
        loss: (float) lasso loss
    """
    n_samples, _ = np.shape(X) # use the size of THIS X, not the global N
    X_aug = np.hstack((np.ones((n_samples, 1)), X)) # prepend intercept column
    data_term = (1 / (2 * n_samples)) * np.linalg.norm(X_aug @ beta_aug - y, 2) ** 2
    return data_term + lam * np.linalg.norm(beta_aug[1:], 1)
def lasso_loss_grad(beta_aug, X, y, lam):
    """
    Compute the (sub)gradient of the LASSO loss.

    grad = (1/N) X_aug^T (X_aug beta_aug - y) + lam * sgn([0; beta]), where the
    sign is applied entrywise and the intercept entry is not regularised.

    Args:
        beta_aug: (np.array) (D+1)x1 augmented feature vector
        X: (np.array) NxD predictor matrix
        y: (np.array) Nx1 predicted variable vector
        lam: (float) lasso penalty parameter
    Returns:
        loss_grad: (np.array) (D+1)x1 gradient of the lasso loss
    """
    n_samples = X.shape[0]
    # prepend the intercept column of ones (inlined augmentation)
    X_aug = np.hstack((np.ones((n_samples, 1)), X))
    data_grad = X_aug.T @ (X_aug @ beta_aug - y) / n_samples
    subgradient = np.sign(beta_aug) # entrywise sgn
    subgradient[0] = 0.0 # the intercept is never regularised
    return data_grad + lam * subgradient
We now implement gradient descent for $\beta_{aug}$. Let $\beta_{aug,i}$ be the $\beta_{aug,i}$ computed after $i$ iterations of gradient descent, and $L_{LASSO}^i = L_{LASSO}(\beta_{aug,i})$. We implement gradient descent, where we check for convergence every $2^n$ iterations. Therefore, our gradient descent will stop when one of the following happens:
def gradient_descent_lasso(beta_in, X, y, lam, threshold=1e-5, maxiter=10_000):
    """
    Perform gradient descent to find the lasso optimal beta_aug.

    Uses a decaying stepsize of 1/j at iteration j. The descent stops when
    either of the following happens:
    - the iteration count reaches maxiter (set to 10^4 by default), or
    - at a power-of-2 iteration, the relative loss change satisfies
      |L_prev - L| < threshold * L_prev (threshold is 10^-5 by default).

    Convergence is only checked every 2^n iterations; between checks the loss
    used in the stopping condition is the one computed at the last check.

    Args:
        beta_in: (np.array) (D+1)x1 initial augmented feature vector
        X: (np.array) NxD predictor matrix
        y: (np.array) Nx1 predicted variable vector
        lam: (float) lasso penalty parameter
        threshold: (float) relative-change threshold for convergence
        maxiter: (int) maximum number of iterations
    Returns:
        beta_n: (np.array) (D+1)x1 lasso optimal augmented feature vector
    """
    beta_n = beta_in # current iterate
    dl_beta_n = lasso_loss_grad(beta_n, X, y, lam) # current gradient
    prev_loss = np.inf # loss at the previous convergence check
    loss = lasso_loss(beta_n, X, y, lam) # most recently computed loss
    j = 1 # iteration counter
    nth = 0 # exponent of the next power-of-2 check
    stepsize = (1 / j) # decaying stepsize
    while j < maxiter and np.abs(prev_loss - loss) >= threshold * prev_loss:
        if j == 2**nth:
            # at a power of 2, refresh the reference loss used for the
            # convergence test, then take a gradient step
            nth += 1
            prev_loss = lasso_loss(beta_n, X, y, lam)
            beta_n = beta_n - stepsize * dl_beta_n # gradient step
            dl_beta_n = lasso_loss_grad(beta_n, X, y, lam)
            loss = lasso_loss(beta_n, X, y, lam)
        else:
            beta_n = beta_n - stepsize * dl_beta_n # gradient step
            dl_beta_n = lasso_loss_grad(beta_n, X, y, lam)
        j += 1 # increment iteration counter
        stepsize = (1 / j) # decay the stepsize
    return beta_n
We now implement the code for cross-validation; we first generate our folds and perform cross validation on said folds for a fixed $LASSO$ regularisation parameter $\lambda$; we will be starting the gradient descent at the least squares optimal $\beta$, $\beta^*$, as we assume the optimal $\beta$ for a given $\lambda$ for LASSO won't stray too far from $\beta^*$ due to the values of $\lambda$ scanned being small.
def generate_folds(X, y, nfolds=5):
    """
    Generate folds for cross validation.

    Requires the number of rows of X (and y) to be divisible by nfolds, since
    np.split raises a ValueError on an unequal division.

    Args:
        X: (np.array) NxD predictor matrix
        y: (np.array) Nx1 predicted variable vector
        nfolds: (int) number of folds
    Returns:
        folds: (dict) dictionary keyed "Fold 0", ..., "Fold nfolds-1"; each value
            is {"X_train": ..., "y_train": ..., "X_val": ..., "y_val": ...}
            where the k-th validation set is the k-th contiguous slice and the
            training set is everything else.
    """
    X_parts = np.split(X, nfolds) # equal-sized contiguous slices of X
    y_parts = np.split(y, nfolds) # matching slices of y
    folds = {}
    for k in range(nfolds):
        held_out_X, held_out_y = X_parts[k], y_parts[k] # validation slice
        remaining_X = np.concatenate(X_parts[:k] + X_parts[k + 1:]) # all other slices
        remaining_y = np.concatenate(y_parts[:k] + y_parts[k + 1:])
        folds[f"Fold {k}"] = {
            "X_train": remaining_X,
            "y_train": remaining_y,
            "X_val": held_out_X,
            "y_val": held_out_y,
        }
    return folds
def lasso_cross_validation_score(X, y, nfolds, lam, threshold=1e-5, maxiter=10_000):
    """
    Compute the cross validation score (mean validation MSE) for a given lambda.

    Bug fix: the gradient descent was previously warm-started from the hidden
    module-level global `beta_star`, which (a) was fitted on ALL of the
    training data, leaking each fold's validation rows into its initialisation,
    and (b) made this function silently depend on a global. The warm start is
    now the least squares solution of the fold's own training split.

    Args:
        X: (np.array) NxD predictor matrix
        y: (np.array) Nx1 predicted variable vector
        nfolds: (int) number of folds
        lam: (float) lasso penalty parameter
        threshold: (float) threshold for convergence
        maxiter: (int) maximum number of iterations
    Returns:
        mse: (float) mean squared error averaged over the folds
    """
    folds = generate_folds(X, y, nfolds) # generate folds
    mses = [] # validation MSE of each fold
    for fold in folds.values():
        X_training = fold["X_train"] # get training data
        y_training = fold["y_train"]
        X_val = fold["X_val"] # get validation data
        y_val = fold["y_val"]
        # warm-start from the least squares solution of THIS fold's training split
        warm_start = compute_beta(X_training, y_training)
        predicted_beta = gradient_descent_lasso(warm_start, X_training, y_training, lam, threshold=threshold, maxiter=maxiter)
        mses.append(compute_mse(X_val, y_val, predicted_beta)) # validation MSE
    return np.mean(mses)
We now test a range of lambdas and find the one that minimises $\langle MSE \rangle$ over all folds:
def test_lambdas(X, y, lambdas, nfolds, threshold=1e-5, maxiter=10_000):
    """
    Cross-validate a range of lambdas and return the one minimising the MSE.

    Args:
        X: (np.array) NxD predictor matrix
        y: (np.array) Nx1 predicted variable vector
        lambdas: (np.array) 1xL array of lambdas to test
        nfolds: (int) number of folds
        threshold: (float) threshold for convergence
        maxiter: (int) maximum number of iterations
    Returns:
        mse_arr: (np.array) Lx2 array [[lambda_1, mse_1], ..., [lambda_L, mse_L]]
        optimal_pair: (np.array) 1x2 array [optimal_lambda, optimal_mse] for the
            lambda achieving the smallest cross-validated MSE
    """
    records = []
    for lam in tqdm(lambdas): # progress bar over the lambda grid
        score = lasso_cross_validation_score(X, y, nfolds, lam, threshold=threshold, maxiter=maxiter)
        records.append([lam, score])
    mse_arr = np.asarray(records)
    optimal_pair = mse_arr[np.argmin(mse_arr[:, 1])] # row with the smallest MSE
    return mse_arr, optimal_pair
We apply this to our dataset and compute the optimal regularisation parameter for LASSO regression, denoted by $\lambda^*_{LASSO}$. We start by scanning a large range of $\lambda$ s in order to determine the correct order of magnitude to search for $\lambda_{LASSO}^*$; as this is an initial scan, maxiter was capped to 2000 iterations:
initial_lasso_scan, _ = test_lambdas(X_train, y_train, [1e-3,1e-2,1e-1,1,1e1,1e2,1e3], 5, threshold=1e-3, maxiter=2_000)
100%|██████████| 7/7 [00:23<00:00, 3.33s/it]
plt.plot(initial_lasso_scan[:,0], initial_lasso_scan[:,1], label="$\\langle MSE \\rangle$ over folds", color="black") # cross-validated MSE as a function of lambda
plt.xlabel("Regularisation parameter $\lambda$")
plt.ylabel("$\\langle MSE \\rangle$")
plt.title("Initial $\lambda_{LASSO}^*$ Scan")
plt.xscale("log") # the lambdas were sampled logarithmically
plt.legend()
plt.show()
After completing an initial scan, it is clear $\lambda^*_{LASSO}\in[0,0.1]$. We perform a second scan to further increase the resolution, this time using $100$ values between $0$ and $0.1$:
mses_LASSO, optimal_pair_LASSO = test_lambdas(X_train, y_train, np.linspace(0, 0.1, 100), 5, threshold=1e-5, maxiter=10_000) # fine scan of 100 lambdas over [0, 0.1]
optimal_lambda_LASSO = optimal_pair_LASSO[0] # lambda minimising the cross-validated MSE
100%|██████████| 100/100 [00:51<00:00, 1.94it/s]
We now visualise how $\langle MSE\rangle$ changes when $\lambda$ is varied in LASSO regression:
plt.plot(mses_LASSO[:,0], mses_LASSO[:,1], label="$\\langle MSE \\rangle$ over folds", color="black") # cross-validated MSE as a function of lambda
plt.plot(optimal_pair_LASSO[0], optimal_pair_LASSO[1], "*", label="$\lambda^*_{LASSO}$", color="red", markersize=10, alpha=0.5) # mark the minimiser
plt.xlabel("Regularisation parameter $\lambda$")
plt.ylabel("$\\langle MSE \\rangle$")
plt.title("LASSO cross validation")
plt.legend()
plt.show()
We display $\lambda_{LASSO}^*$ and the optimal value of $\langle MSE \rangle$ for completeness:
print("The optimal lambda for LASSO is: ", optimal_pair_LASSO[0]) # lambda minimising the cross-validated MSE
print("The optimal average MSE for LASSO is: ", optimal_pair_LASSO[1]) # the corresponding cross-validated MSE
The optimal lambda for LASSO is: 0.00101010101010101 The optimal average MSE for LASSO is: 1.8798973571799167
We now visualise what happens to the components of the feature vector $\beta^*_{LASSO,\,\lambda}$ as we vary our value of lambda; we first define a function to generate $\beta^*_{LASSO,\lambda_{i}}$ given array of $\lambda_i$ values:
def generate_betas(fold, lambdas, threshold=1e-5, maxiter=10_000):
    """
    Generate the LASSO-optimal betas on one fold for a range of lambdas.

    Args:
        fold: (dict) dictionary of the form:
            {"X_train" : X_training, "y_train" : y_training, "X_val": X_val, "y_val" : y_val}
        lambdas: (np.array) 1xL array of lambdas to test
        threshold: (float) threshold for convergence of the gradient descent
        maxiter: (int) maximum number of gradient descent iterations
    Returns:
        betas: (np.array) Lx(D+2)x1 array; entry l is lambda_l stacked on top of
            the fitted (D+1)x1 beta for that lambda
    """
    X_training = fold["X_train"] # get training predictor matrix from the fold
    y_training = fold["y_train"] # get training outcome vector from the fold
    betas = []
    for lam in tqdm(lambdas):
        # warm-start the descent at the global least squares solution beta_star
        predicted_beta = gradient_descent_lasso(beta_star, X_training, y_training, lam, threshold=threshold, maxiter=maxiter)
        # store the lambda stacked on top of the fitted beta
        betas.append(np.vstack([lam, predicted_beta]))
    return np.asarray(betas)
We now fix a fold and generate the betas for said fold.
fixed_fold = generate_folds(X_train, y_train, 5)["Fold 0"] # fix a single CV fold on which to trace the coefficient paths
generated_betas = generate_betas(fixed_fold, np.linspace(0, 3, 51), threshold=1e-5, maxiter=5_000) # coefficient paths for 51 lambdas in [0, 3]
100%|██████████| 51/51 [02:29<00:00, 2.94s/it]
We now display how the components of $\beta^*_{LASSO,\,\lambda}$ evolve as $\lambda$ grows. Let $\mathbf{v}^i$ be the $i$ th component of the vector $\mathbf{v}$, using $0$-based indexing (this is so $(\beta_{aug})^0 = \beta_0$); then:
# 2x3 grid: one panel per non-intercept coefficient
fig, axes = plt.subplots(2, 3, figsize=(12, 8))
for i, ax in enumerate(axes.flatten()):
    # column 0 of generated_betas holds lambda; columns 2..7 hold the
    # non-intercept coefficients (column 1, the intercept, is plotted in a later cell)
    ax.plot(generated_betas[:,0], generated_betas[:,i+2], color="black")
    ax.set_xlabel("$\lambda$")
    ax.set_title(f"$\left(\\beta^*_{{LASSO,\,\lambda}}\\right)^{{{i+1}}}$")
    ax.set_ylabel("value")
fig.suptitle("Components of $\\beta^*_{LASSO,\,\lambda}$ as a function of $\lambda$ for a fixed fold")
plt.tight_layout()
plt.show()
We can see that as lambda increases, $\left(\beta_{LASSO,\,\lambda}^*\right)^i\to 0$, $i\neq 0$. This is due to the $L_1$-norm term in the LASSO loss function, which induces sparsity in $\beta$. However, we can see some outliers; we see that $\beta_5$ does not tend to $0$ monotonically, and that $\beta_7$'s value fluctuates when $\lambda$ reaches around $0.5$; these behaviours can be attributed to numerical artifacts, especially the latter.
This behaviour occurs due to the sparsity-inducing nature of the $L_1$ norm; we note that the LASSO loss function has the following dual formulation: $$ \min_{\beta}\|\mathbf{X}\beta- y\|_2^2 \text{ such that } \|\beta\|_1\leq k $$
For some $k = k(\lambda)$, where the relation is inversely proportional (as $\lambda$ increases, $k$ decreases, not necessarily reciprocally/linearly). Therefore, when $\lambda$ increases and thus $k$ decreases, the predicted features must be closer to the origin and, due to the sparsity-inducing property of the $L_1$ norm, $\beta\to\mathbf{0}$. This induces a bias in $\beta$, as the induced sparsity drives it away from its true optimal value $\beta^*$, meaning the bias $\mathbb{E}[\hat{\beta}] - \beta^*$ grows in magnitude. However, the variance is reduced: as $\lambda$ increases, the values $\beta$ may take become more constrained, so the spread of the possible estimates decreases.
Here, we have omitted the intercept term, as our loss function does not regularise over $\beta_0$. We now plot $\beta_0$:
# Column 1 of generated_betas is the intercept beta_0.
# Bug fix: the legend label previously read "beta_1" although the plotted
# component is the intercept beta_0; a legend call is also added so the label
# is actually displayed.
plt.plot(generated_betas[:,0], generated_betas[:,1], label="$\\beta_0$", color="black")
plt.xlabel("$\lambda$")
plt.ylabel("Value")
plt.title("$\left(\\beta^*_{LASSO,\,\lambda}\\right)^0$")
plt.ylim(-0.1, 150)
plt.legend()
plt.show()
As we are not regularising over $\beta_0$, we can see it stays constant regardless of the value of $\lambda$, as expected.
We now find $\beta_{LASSO,\lambda^*}^*$ and compute the in-sample and out-of-sample $MSE$ s and $R^2$ errors:
# Fit LASSO at the optimal lambda on the full training set.
# Bug fix: 10_000 was previously passed as the 5th POSITIONAL argument, i.e. as
# `threshold`, which made the descent terminate after a single iteration; it is
# meant to be the iteration cap, so it is now passed as maxiter.
beta_star_lasso = gradient_descent_lasso(beta_star, X_train, y_train, optimal_lambda_LASSO, maxiter=10_000)
print("In sample MSE: " + str(compute_mse(X_train, y_train, beta_star_lasso))) # in sample MSE
print("In sample R2: " + str(compute_R2(X_train, y_train, beta_star_lasso))) # in sample R2
print()
print("Out of sample MSE: " + str(compute_mse(X_test, y_test, beta_star_lasso))) # out of sample MSE
print("Out of sample R2: " + str(compute_R2(X_test, y_test, beta_star_lasso))) # out of sample R2
In sample MSE: 1.8755614954545592 In sample R2: 0.9177341847407415 Out of sample MSE: 1.9838701234315852 Out of sample R2: 0.9083412457539801
Once again, we observe that the model performs slightly worse out of sample than in sample. We now discuss the differences between $\beta^*$ and $\beta_{LASSO}^*$:
# compare the LASSO solution with the unregularised least squares solution
print("The difference between the LASSO beta and the true beta is: ")
print(np.abs(beta_star_lasso-beta_star))
print()
print("The L1 norm of the LASSO optimal beta is: {}".format(np.linalg.norm(beta_star_lasso, ord=1)))
print("The L1 norm of the least squares optimal beta is: {}".format(np.linalg.norm(beta_star, ord=1)))
print()
print("As a reminder, the optimal value of lambda for LASSO is: {}".format(optimal_lambda_LASSO))
print()
The difference between the LASSO beta and the true beta is: [[1.42108547e-14] [1.01010101e-03] [1.01010101e-03] [1.01010101e-03] [1.01010101e-03] [1.01010101e-03] [1.01010101e-03]] The L1 norm of the LASSO optimal beta is: 134.1196991219072 The L1 norm of the least squares optimal beta is: 134.12575972796782 As a reminder, the optimal value of lambda for LASSO is: 0.00101010101010101
We start by noting that the small difference in $\beta_0$ is due to the regularisation not affecting $\beta_0$, thus the intercept terms of both $\beta^*$ and $\beta^*_{LASSO}$ remain identical up to a very small precision that can be attributed up to numerical error.
We note that all other terms in $\beta^*$ and $\beta^*_{LASSO}$ differ by exactly $\lambda^*_{LASSO}$.
Moreover, we can see that the $L_1$ norm of $\beta_{LASSO}^*$ is less than that of $\beta^*$, as expected, since LASSO penalises $\beta$ having a large $L_1$ norm and thus forces $\|\beta\|_1$ to decrease.
Lastly, we recall our $MSE$ and $R^2$ scores for $\beta^*$ and $\beta_{LASSO}^*$:
print("In sample MSE (LS Beta): " + str(compute_mse(X_train, y_train, beta_star))) # in sample MSE
print("In sample R2 (LS Beta): " + str(compute_R2(X_train, y_train, beta_star))) # in sample R2
print()
print("Out of sample MSE (LS Beta): " + str(compute_mse(X_test, y_test, beta_star))) # out of sample MSE (comment previously mislabelled "in sample")
print("Out of sample R2 (LS Beta): " + str(compute_R2(X_test, y_test, beta_star))) # out of sample R2 (comment previously mislabelled "in sample")
print("------------------------------------")
print("In sample MSE (LASSO Beta): " + str(compute_mse(X_train, y_train, beta_star_lasso))) # in sample MSE
print("In sample R2 (LASSO Beta): " + str(compute_R2(X_train, y_train, beta_star_lasso))) # in sample R2
print()
print("Out of sample MSE (LASSO Beta): " + str(compute_mse(X_test, y_test, beta_star_lasso))) # out of sample MSE
print("Out of sample R2 (LASSO Beta): " + str(compute_R2(X_test, y_test, beta_star_lasso))) # out of sample R2
In sample MSE (LS Beta): 1.8755566396402015 In sample R2 (LS Beta): 0.9177343977263077 Out of sample MSE (LS Beta): 1.9843895771191569 Out of sample R2 (LS Beta): 0.9083172459581623 ------------------------------------ In sample MSE (LASSO Beta): 1.8755614954545592 In sample R2 (LASSO Beta): 0.9177341847407415 Out of sample MSE (LASSO Beta): 1.9838701234315852 Out of sample R2 (LASSO Beta): 0.9083412457539801
We can see that the in-sample $MSE$ is smaller and thus better for $\beta^*$. Similarly, in-sample $R^2$ is greater and better for $\beta^*$ than for $\beta^*_{LASSO}$. However, we note that $\beta^*_{LASSO}$ performs better out-of-sample, as it has lower out-of-sample $MSE$ and higher out-of-sample $R^2$ than $\beta^*$; this agrees with our expectations as LASSO introduces bias into our $\beta$ to reduce its variance in order to diminish its generalisation error, which it manages to do in this instance.
We now train a model using elastic net regularisation, whose loss function is given by:
$$ L_{EN}(\beta_0,\beta) = \frac{1}{2N}\|\mathbf{y}-\mathbf{X}\beta - \beta_0 \|^2 + \lambda(\alpha\|\beta\|_1 + (1-\alpha)\|\beta\|_2^2) $$Augmenting $\mathbf{X}$ yields:
$$ L_{EN}(\beta_{aug}) = \frac{1}{2N}\|\mathbf{y}-\mathbf{X}_{aug}\beta_{aug}\|^2 + \lambda\left(\alpha\left\|\begin{bmatrix} 0 \\ \beta\end{bmatrix}\right\|_1 + (1-\alpha)\left\|\begin{bmatrix} 0 \\ \beta\end{bmatrix}\right\|_2^2\right) $$We differentiate w.r.t. $\beta_{aug}$ to find the gradient to do gradient descent. This yields:
$$ \nabla_{\beta_{aug}}L_{EN}(\beta_{aug}) = \frac{1}{N}(\mathbf{X}_{aug}^\top(\mathbf{X}_{aug}\beta_{aug} - \mathbf{y})) + \lambda\left( \alpha\, sgn.\left(\begin{bmatrix} 0 \\ \beta\end{bmatrix}\right) + (1-\alpha)\begin{bmatrix} 0 \\ \beta\end{bmatrix}\right) $$This is implemented below:
def en_loss(beta_aug, X, y, lam, alpha):
    """
    Compute the elastic net loss.

    loss = (1/2N)||y - X_aug beta_aug||^2
           + lam * (alpha*||beta||_1 + (1-alpha)*||beta||_2^2),
    where the intercept beta_aug[0] is excluded from both penalty terms.

    Args:
        beta_aug: (np.array) (D+1)x1 augmented feature vector
        X: (np.array) NxD predictor matrix
        y: (np.array) Nx1 predicted variable vector
        lam: (float) overall penalty strength
        alpha: (float) mixing parameter between L1 (alpha) and squared L2 (1-alpha)
    Returns:
        loss: (float) elastic net loss
    """
    n_samples = X.shape[0]
    # prepend the intercept column of ones (inlined augmentation)
    X_aug = np.hstack((np.ones((n_samples, 1)), X))
    residual = y - X_aug @ beta_aug
    data_term = np.linalg.norm(residual) ** 2 / (2 * n_samples)
    coeffs = np.vstack([0, beta_aug[1:]]) # zero in the intercept slot
    penalty = lam * (alpha * np.linalg.norm(coeffs, ord=1) + (1 - alpha) * np.linalg.norm(coeffs, ord=2) ** 2)
    return data_term + penalty
def en_loss_grad(beta_aug, X, y, lam, alpha):
    """
    Evaluate the (sub)gradient of the elastic net loss at beta_aug.

    Args:
        beta_aug: (np.array) (D+1)x1 augmented feature vector
        X: (np.array) NxD predictor matrix
        y: (np.array) Nx1 predicted variable vector
        lam: (float) elastic net penalty parameter
        alpha: (float) elastic net mixing parameter
    Returns:
        loss_grad: (np.array) (D+1)x1 gradient of elastic net loss
    """
    n_obs = np.shape(X)[0]  # number of observations
    X_aug = augment(X)  # prepend the intercept column to the predictors
    beta_pen = np.vstack([0, beta_aug[1:]])  # intercept is not penalised
    residual_grad = (1/n_obs) * (X_aug.T @ (X_aug@beta_aug-y))  # gradient of the squared-error term
    # sign(.) is the subgradient of the L1 part; the L2 part is smooth
    penalty_grad = lam * (alpha * np.sign(beta_pen) + (1-alpha) * beta_pen)
    return residual_grad + penalty_grad
We now implement gradient descent for $\beta_{aug}$ for the Elastic Net loss function. Using the same notation as in section 1.2.1, our gradient descent will stop when one of the following happens:
maxiter (set to $10^4$ by default)threshold $\quad$ (where threshold is set to $10^{-5}$ by default)def gradient_descent_en(beta_in, X, y, lam, alpha, threshold = 1e-5, maxiter=10_000):
    """
    Perform gradient descent to find the elastic net optimal beta_aug

    The stepsize decays as 1/j with the iteration counter j.  The two loss
    values used by the stopping rule are only refreshed at iterations where
    j is a power of two, so convergence is assessed between geometrically
    spaced checkpoints rather than at every step; in between, only the
    gradient step is taken.  Descent stops when the relative change in loss
    between checkpoints falls below `threshold`, or after `maxiter` steps.

    Args:
        beta_in: (np.array) (D+1)x1 initial augmented feature vector
        X: (np.array) NxD predictor matrix
        y: (np.array) Nx1 predicted variable vector
        lam: (float) elastic net penalty parameter
        alpha: (float) elastic net mixing parameter
        threshold: (float) relative-change threshold for convergence
        maxiter: (int) maximum number of iterations
    Returns:
        beta_n: (np.array) (D+1)x1 elastic net optimal augmented feature vector
    """
    beta_n = beta_in # initialise beta at the supplied starting point
    dl_beta_n = en_loss_grad(beta_n, X, y, lam, alpha) # gradient at the starting point
    prev_loss = np.inf # inf guarantees the first convergence test passes
    loss = en_loss(beta_n, X, y, lam, alpha) # loss at the starting point
    j = 1 # iteration counter (also drives the 1/j stepsize)
    nth = 0 # exponent of the next power-of-two checkpoint
    stepsize = (1 / j) # initialise stepsize
    while j < maxiter and np.abs(prev_loss - loss)>=prev_loss*threshold: # stop on maxiter or small relative loss change
        if j == 2**nth: # checkpoint iteration: refresh the losses used by the stopping rule
            nth += 1 # move on to the next power-of-two checkpoint
            prev_loss = en_loss(beta_n, X, y, lam, alpha) # loss before this step
            beta_n = beta_n - stepsize*dl_beta_n # gradient step
            dl_beta_n = en_loss_grad(beta_n, X, y, lam, alpha) # gradient at the new point
            loss = en_loss(beta_n, X, y, lam, alpha) # loss after this step
        else:
            beta_n = beta_n - stepsize*dl_beta_n # gradient step (losses stay stale until the next checkpoint)
            dl_beta_n = en_loss_grad(beta_n, X, y, lam, alpha) # gradient at the new point
        j += 1 # update iteration counter
        stepsize = (1 / j) # decay the stepsize as 1/j
    return beta_n
We now perform grid search over $\alpha = 0.1,0.5, 0.9 $ and $\lambda\in[0,0.01]$; we first implement functions to compute the cross-validated $\langle MSE \rangle$ for a given $(\alpha, \lambda)$ pair, and to carry out the grid search over $\lambda$ for each $\alpha$.
Once again, we initialise $\beta$ to the optimal least squares $\beta$, $\beta^*$, as the assumption that the regularisation won't move the value of $\beta$ far away from $\beta^*$ can still be made.
def en_cross_validation_score(X, y, nfolds, lam, alpha, threshold=1e-5, maxiter=10_000):
    """
    Estimate the out-of-fold MSE of an elastic net fit via k-fold cross validation.

    Args:
        X: (np.array) NxD predictor matrix
        y: (np.array) Nx1 predicted variable vector
        nfolds: (int) number of folds
        lam: (float) elastic net penalty parameter
        alpha: (float) elastic net mixing parameter
        threshold: (float) threshold for convergence
        maxiter: (int) maximum number of iterations
    Returns:
        mse: (float) mean squared error of the elastic net model
    """
    fold_mses = []  # one validation MSE per fold
    for fold in generate_folds(X, y, nfolds).values():
        # fit on the fold's training portion, warm-started at beta_star
        fitted_beta = gradient_descent_en(beta_star, fold["X_train"], fold["y_train"],
                                          lam, alpha, threshold=threshold, maxiter=maxiter)
        # score on the fold's held-out portion
        fold_mses.append(compute_mse(fold["X_val"], fold["y_val"], fitted_beta))
    return np.mean(fold_mses)  # average validation MSE across folds
def en_test_lambdas(X, y, lambdas, alphas, nfolds, threshold=1e-5, maxiter=10_000):
    """
    Grid-search the cross-validated MSE of the elastic net over (alpha, lambda).

    Args:
        X: (np.array) NxD predictor matrix
        y: (np.array) Nx1 predicted variable vector
        lambdas: (np.array) Lx1 array of lambdas
        alphas: (np.array) Ax1 array of alphas
        nfolds: (int) number of folds
        threshold: (float) threshold for convergence
        maxiter: (int) maximum number of iterations
    Returns:
        mse_arrs: (np.array) AxLx2 array of mean squared errors for each alpha and lambda
        optimal_pairs: (np.array) Ax3 array of optimal alpha and lambda pairs
    """
    mse_arrs = []  # per-alpha (lambda, <MSE>) tables
    optimal_pairs = []  # per-alpha (alpha, lambda*, <MSE>*) rows
    for alpha in tqdm(alphas):
        # cross-validated MSE for every lambda at this alpha
        scores = np.asarray([
            [lam, en_cross_validation_score(X, y, nfolds, lam, alpha,
                                            threshold=threshold, maxiter=maxiter)]
            for lam in tqdm(lambdas)
        ])
        mse_arrs.append([alpha, scores])
        best_row = scores[np.argmin(scores[:, 1])]  # (lambda*, <MSE>*) for this alpha
        optimal_pairs.append(np.concatenate([[alpha], best_row]))
    return np.asarray(mse_arrs, dtype=object), np.asarray(optimal_pairs)
We now compute $\lambda^*_{0.1},\lambda^*_{0.5}$ and $\lambda^*_{0.9}$:
# 5-fold CV grid search: 51 lambda values in [0, 0.01] for each alpha in {0.1, 0.5, 0.9}
mse_arrs, pairs = en_test_lambdas(X_train, y_train, np.linspace(0, 0.01, 51),\
[0.1,0.5,0.9], 5, threshold=1e-6, maxiter=5_000) # compute mean squared errors for different lambdas
100%|██████████| 51/51 [00:43<00:00, 1.17it/s] 100%|██████████| 51/51 [00:27<00:00, 1.86it/s] 100%|██████████| 51/51 [00:07<00:00, 7.06it/s] 100%|██████████| 3/3 [01:18<00:00, 26.08s/it]
We display the pairs $(\alpha, \lambda_{\alpha}^*, \langle MSE\rangle^*_{\alpha})$, where $\langle MSE\rangle^*_{\alpha}$ is the best average $MSE$ over the folds for a given $\alpha$:
# one row of `pairs` per alpha: (alpha, lambda*, best average MSE)
for alpha_v, lam_v, mse_v in pairs:
    print("alpha = ", alpha_v)
    print("optimal lambda = ", lam_v)
    print("optimal mse = ", mse_v)
    print()
alpha = 0.1 optimal lambda = 0.0002 optimal mse = 1.879952681679405 alpha = 0.5 optimal lambda = 0.0004 optimal mse = 1.8799601083775783 alpha = 0.9 optimal lambda = 0.0012000000000000001 optimal mse = 1.8799620776179111
The results above are shown in the plot below; in the following graph, let $\langle MSE\rangle_\alpha$ be the $\langle MSE\rangle$ curve associated with the elastic net mixing parameter $\alpha$.
colors = ["black", "red", "grey"]  # one colour per alpha curve
for i in tqdm(range(len(mse_arrs))):
    alpha = mse_arrs[i][0]  # mixing parameter for this curve
    mses_alphas = mse_arrs[i][1]  # (lambda, <MSE>) rows for this alpha
    # raw strings fix the invalid "\l" escape sequences in the TeX labels
    # (a DeprecationWarning today, a SyntaxError in future Python)
    plt.plot(mses_alphas[:, 0], mses_alphas[:, 1],
             label=r"$\langle MSE\rangle_{{{alpha}}}$".format(alpha=alpha),
             color=colors[i])  # <MSE> vs lambda curve
    plt.plot(pairs[i, 1], pairs[i, 2],
             marker="*", label=r"$\lambda^*_{{{alpha}}}$".format(alpha=alpha),
             color=colors[i], zorder=1000, markersize=10)  # mark the optimal lambda
plt.xlabel(r"Regularisation parameter $\lambda$")
plt.ylabel(r"$\langle MSE\rangle$")
plt.title(r"$\langle MSE\rangle$ for different $\lambda$ and $\alpha$ values")
plt.legend()
plt.show()
100%|██████████| 3/3 [00:00<00:00, 170.99it/s]
We display $\lambda^*_{0.1}, \lambda^*_{0.5}, \lambda^*_{0.9}$
# pairs rows are ordered as alpha = 0.1, 0.5, 0.9
for idx, alpha_v in enumerate((0.1, 0.5, 0.9)):
    print(f"For Alpha = {alpha_v}, the optimal lambda is", pairs[idx, 1])
For Alpha = 0.1, the optimal lambda is 0.0002 For Alpha = 0.5, the optimal lambda is 0.0004 For Alpha = 0.9, the optimal lambda is 0.0012000000000000001
Let $\beta^*_{EN,\alpha}$ be the augmented feature vector that minimises the $\langle MSE \rangle$ across all cross-validation folds for a fixed $\alpha$ (using the notation from 1.2, this can also be expressed as $\beta^*_{EN,\lambda^*_\alpha}$)
Having $\lambda_{0.1}^*,\lambda_{0.5}^*$ and $\lambda_{0.9}^*$, we now compute $\beta^*_{EN,\alpha}$ on the whole train set and use this to calculate the out-of-sample $MSE$ and $R^2$ scores for $\alpha=0.1,0.5,0.9$
beta_stars_en = []  # optimal beta (per alpha) refit on the full training set
for alpha, lam, _ in pairs:
    print("Alpha = " + str(alpha))
    # BUG FIX: 5_000 was previously passed positionally and landed in
    # `threshold` (the 6th parameter), not `maxiter`, which made the
    # descent stop almost immediately; pass it by keyword instead.
    beta_star_en = gradient_descent_en(beta_star, X_train, y_train, lam, alpha,
                                       maxiter=5_000)  # compute optimal beta
    beta_stars_en.append(np.vstack([alpha, beta_star_en]))  # record (alpha, beta*)
    print("Out of sample MSE: " + str(compute_mse(X_test, y_test, beta_star_en)))  # out of sample MSE
    print("Out of sample R2: " + str(compute_R2(X_test, y_test, beta_star_en)))  # out of sample R2
    print()
Alpha = 0.1 Out of sample MSE: 1.9843642929376566 Out of sample R2: 0.9083184141377478 Alpha = 0.5 Out of sample MSE: 1.9842699947212405 Out of sample R2: 0.9083227709033154 Alpha = 0.9 Out of sample MSE: 1.9838262587927602 Out of sample R2: 0.9083432723877313
Based on these metrics, one can see that $\alpha=0.9$ yields the best model, as it has the lowest out-of-sample $MSE$ and highest $R^2$ score. Afterwards $\alpha=0.5$ gives the second best model, and $\alpha=0.1$ yields the worst model.
We now widen our range of $\alpha$ s used in our grid search and find $\lambda_\alpha^*$ for each $\alpha$; in our case, $\alpha = 0,0.2,0.4, 0.5, 0.6,0.8, 1$
alphas = [0,0.2,0.4,0.5,0.6,0.8,1] # widened grid of elastic net mixing parameters
# finer lambda grid: 101 points in [0, 0.01], 5-fold CV for each alpha
full_mse_arrs, full_pairs = en_test_lambdas(X_train, y_train, np.linspace(0, 0.01, 101), alphas, 5, threshold=1e-6, maxiter=5_000)
100%|██████████| 101/101 [01:33<00:00, 1.08it/s] 100%|██████████| 101/101 [01:16<00:00, 1.32it/s] 100%|██████████| 101/101 [00:51<00:00, 1.95it/s] 100%|██████████| 101/101 [00:41<00:00, 2.46it/s] 100%|██████████| 101/101 [00:30<00:00, 3.28it/s] 100%|██████████| 101/101 [00:20<00:00, 5.02it/s] 100%|██████████| 101/101 [00:09<00:00, 10.75it/s] 100%|██████████| 7/7 [05:23<00:00, 46.26s/it]
We display $\alpha, \lambda^*_\alpha$ and $\langle MSE \rangle ^*_\alpha$:
# one row of `full_pairs` per alpha: (alpha, lambda*, best average MSE)
for alpha_v, lam_v, mse_v in full_pairs:
    print("Alpha = ", alpha_v)
    print("Optimal Lambda = ", lam_v)
    print("Optimal MSE = ", mse_v)
    print()
Alpha = 0.0 Optimal Lambda = 0.0002 Optimal MSE = 1.8799605950559788 Alpha = 0.2 Optimal Lambda = 0.0002 Optimal MSE = 1.8799583588620206 Alpha = 0.4 Optimal Lambda = 0.00030000000000000003 Optimal MSE = 1.8799601913778765 Alpha = 0.5 Optimal Lambda = 0.00030000000000000003 Optimal MSE = 1.8799521713375529 Alpha = 0.6 Optimal Lambda = 0.0004 Optimal MSE = 1.8799598898584293 Alpha = 0.8 Optimal Lambda = 0.0007 Optimal MSE = 1.8799598284563057 Alpha = 1.0 Optimal Lambda = 0.0008 Optimal MSE = 1.8799640091972871
We now recalculate $\beta^*_{EN,\alpha}$ on the whole train set for the values of $\alpha$ previously stated.
# refit beta on the whole training set at (alpha, lambda*_alpha) for every
# alpha in the extended grid; each row stores alpha followed by its beta*
full_beta_stars_en = np.asarray([
    np.vstack([alpha_v, gradient_descent_en(beta_star, X_train, y_train,
                                            lam_v, alpha_v, maxiter=10_000)])
    for alpha_v, lam_v, _ in tqdm(full_pairs)
]).squeeze()
100%|██████████| 7/7 [00:00<00:00, 562.12it/s]
We now visualise the evolution of the components of $\beta^*_{EN,\alpha}$ as $\alpha$ changes from $0$ to $1$. As before, $\left(\beta^*_{EN,\alpha}\right)^0$ has been excluded as it stays constant due to the Elastic Net loss function not regularising over $\left(\beta^*_{EN,\alpha}\right)^0$.
alphas = np.asarray(alphas)
# np.isin replaces np.in1d, which is deprecated and removed in NumPy 2.0
alpha_indices = np.isin(alphas, [0, 0.5, 1])  # highlight alpha = 0, 0.5, 1
fig, axes = plt.subplots(2, 3, figsize=(12, 8))
for i, ax in enumerate(axes.flatten()):
    # trajectory of coefficient i+1 (intercept excluded) as alpha varies
    ax.plot(full_beta_stars_en[:, 0], full_beta_stars_en[:, i+2], label="beta " + str(i+2), color="black")
    ax.scatter(full_beta_stars_en[:, 0][alpha_indices], full_beta_stars_en[:, i+2][alpha_indices], color="red", zorder=1000, s=10)
    ax.set_xlabel(r"$\alpha$")
    ax.set_ylabel("Value")
    # raw string fixes the invalid "\l" escape in "\left"
    ax.set_title(r"$\left(\beta^*_{{EN,\alpha}}\right)^{{{comp}}}$".format(comp=i+1))
plt.suptitle("Elastic Net Coefficients")
plt.tight_layout()
plt.show()
Before starting this discussion, it is important to point out the difference between shrinkage and sparsity; whilst they both follow the same idea, that of feature selection and decreasing the values of features, shrinkage does so in a smooth manner, whilst sparsity achieves it in a non-smooth and thus more abrupt fashion.
We can see that, generally, $|(\beta^*_{EN, \alpha})^i|$ decreases as $\alpha$ goes from $0$ to $1$; this is due to the $L_1$ norm's sparsity-inducing property and the $L_2$ norm's shrinkage-inducing property mixing in the elastic net loss function. As $\alpha$ goes from $0$ to $1$, inspecting the formula shows that more weight is given to the $L_1$ norm, thus increasing the amount of sparsity induced into $\beta$ and reducing the shrinkage, making $\beta$'s coefficients decrease in absolute value.
When $\alpha = 0$, the Elastic Net loss function turns into the Ridge loss function, and thus the coefficients experience shrinkage instead of sparsity.
When $\alpha = 0.5$, the loss function contains an equal amount of Ridge and LASSO regularisations, and thus we expect the value of $\beta^*_{EN, \alpha}$ to be an interpolation between $\beta^*_{LASSO, \lambda^*_\alpha}$ and $\beta^*_{Ridge, \lambda^*_\alpha}$, the optimal feature vector obtained using Ridge regularisation and regularisation parameter $\lambda^*_\alpha$; this is observed in the plot above.
When $\alpha=1$, the Elastic Net loss function turns into the LASSO loss function, and thus the coefficients become sparse and their value decreases more towards $0$; this is what is observed, as the components $(\beta^*_{EN,\alpha})^i$ obtain their minimal value at $\alpha = 1$.
Lastly, we can see the plot above doesn't look smooth and presents many jagged edges and rough paths — these are numerical artifacts of the gradient descent routine: the Elastic Net loss is non-differentiable (its $L_1$ term has no derivative at $0$), so the $\operatorname{sgn}$ term we use is only a subgradient, and the approximate stopping criterion built on it yields small irregularities in the final results.
We are tasked to train a k-Nearest Neighbour (kNN) regression model on our airfoil dataset. We give a brief reminder of the $k$-Nearest Neighbour algorithm for regression. The aim of this algorithm is to predict an outcome $y$ given a predictor vector $\mathbf{x}$ by looking at points in a neighbourhood of $\mathbf{x}$. This is done by choosing the $k$ closest points (typically, the Euclidean distance is used, but other metrics may be employed) to $\mathbf{x}$ and computing the average value of the outcome variable $y$ across these $k$ points. The algorithm may be summarised as follows:
For kNN, it is important to standardise the input data $\mathbf{X}$: if one feature (denoted $\mathbf{X_{:,i}}$) varies on a much larger scale than the rest, it will dominate the Euclidean distance, and we may fail to capture points that are genuinely close to $\mathbf{x}_{in}$ simply because their difference in $\mathbf{X_{:,i}}$ is numerically large, when in fact they are relatively close in the remaining features. Therefore, we aim to give equal weighting to all of the features, and standardising is crucial to doing so.
We start by defining an Euclidean distance function and a function that returns the $k$ closest points to a given input point.
def euclidean_distance(p, q):
    """
    Compute the Euclidean distance between the point p and each row of q.

    Args:
        p (np.array): A point in R^n
        q (np.array): A matrix whose rows are points in R^n (shape (m, n))
    Returns:
        dist (np.array): Vector of the m Euclidean distances between p and
            each row of q
    """
    # the norm is taken along axis 1, i.e. one distance per row of (p - q)
    difference = p - q
    return np.linalg.norm(difference, 2, axis=1)
def k_neighbours(X_in_train, X_in_test, k=5, return_distance=False):
    """
    Finds the k nearest neighbours of each point in X_in_test in X_in_train.

    Args:
        X_in_train (np.array): A matrix of shape (n_train, n_features)
        X_in_test (np.array): A matrix of shape (n_test, n_features)
        k (int): The number of neighbours to find
        return_distance (bool): If True, the function also returns the
            distances between the points in X_in_test and their k nearest
            neighbours in X_in_train
    Returns:
        indices_arr (np.array): A matrix of shape (n_test, k) containing the
            indices of the k nearest neighbours of each test point
        distances_arr (np.array): A matrix of shape (n_test, k) containing
            the distances to those neighbours
            (only returned if return_distance is True)
    """
    # all pairwise Euclidean distances at once: broadcasting produces an
    # (n_test, n_train) matrix in a single vectorised pass, replacing the
    # previous per-row Python loop with sorted(enumerate(...))
    point_dist = np.linalg.norm(
        np.asarray(X_in_test)[:, None, :] - np.asarray(X_in_train)[None, :, :],
        axis=2)
    # stable argsort reproduces the tie-breaking order of the old
    # Python-level sorted() implementation while running in C
    order = np.argsort(point_dist, axis=1, kind="stable")
    indices_arr = order[:, :k]  # indices of the k smallest distances per row
    if return_distance:
        # gather the matching distances row by row
        distances_arr = np.take_along_axis(point_dist, indices_arr, axis=1)
        return distances_arr, indices_arr
    return indices_arr
def kNN_predict(X_in_train, y_in_train, X_in_test, k=5):
    """
    Predict the outcome for each point in X_in_test as the average outcome
    of its k nearest neighbours in X_in_train.

    Args:
        X_in_train (np.array): A matrix of shape (n_train, n_features)
        y_in_train (np.array): A vector of shape (n_train,)
        X_in_test (np.array): A matrix of shape (n_test, n_features)
        k (int): The number of neighbours to use
    Returns:
        y_pred (np.array): A vector of shape (n_test,) containing the
            predicted output of each point in X_in_test
    """
    neighbour_idx = k_neighbours(X_in_train, X_in_test, k=k)  # (n_test, k) neighbour indices
    # average the training outcomes over each test point's neighbourhood
    return y_in_train[neighbour_idx].mean(axis=1)
We now implement k-fold cross-validation for kNN:
def kNN_cross_validation_score(X, y, nfolds, k=5):
    """
    Estimate the MSE of a kNN regression model via k-fold cross validation.

    Args:
        X (np.array): A matrix of shape (n_samples, n_features)
        y (np.array): A vector of shape (n_samples,)
        nfolds (int): The number of folds to use
        k (int): The number of neighbours to use
    Returns:
        avg_mse (float): The cross validation score
    """
    fold_mses = []  # one validation MSE per fold
    for fold in generate_folds(X, y, nfolds).values():
        # predict the held-out fold from the remaining training data
        preds = kNN_predict(fold["X_train"], fold["y_train"], fold["X_val"], k)
        fold_mses.append(np.mean((preds - fold["y_val"])**2))
    # average validation MSE across folds
    return np.mean(fold_mses)
def optimise_k(X, y, nfolds, ks, return_mse=False):
    """
    Find the number of neighbours that minimises the cross-validated MSE of
    a kNN regression model.

    Args:
        X (np.array): A matrix of shape (n_samples, n_features)
        y (np.array): A vector of shape (n_samples,)
        nfolds (int): The number of folds to use
        ks (list): A list of integers containing the values of k to try
        return_mse (bool): If True, also return the list of MSEs
            corresponding to each value of k
    Returns:
        best_k (int): The optimal number of neighbours
        mse (list): A list of floats containing the MSEs corresponding to
            each value of k (only returned if return_mse is True)
    """
    # cross-validated MSE for every candidate k
    mse = [kNN_cross_validation_score(X, y, nfolds, k) for k in tqdm(ks)]
    best_k = ks[np.argmin(mse)]  # k attaining the smallest average MSE
    return (best_k, mse) if return_mse else best_k
We now compute the optimal value of $k$, $k^*$, for this regression task. Additionally, we return the $\langle MSE \rangle$ for each value of $k$, denoted as $\langle MSE \rangle_k$ to better visualise why $k^*$ is the best value of $k$. We test $k\in\{1,2,\dots,30\}$. To do this, we perform $5$-fold cross validation on our training dataset:
optimal_k, mses = optimise_k(X_train, y_train, 5, [i for i in range(1, 31)], return_mse=True)
100%|██████████| 30/30 [09:17<00:00, 18.57s/it]
print("For kNN regression, the optimal number of neighbours is {}.".format(optimal_k))
For kNN regression, the optimal number of neighbours is 7.
We now plot $k$ against $\langle MSE\rangle_k$:
# raw strings fix the invalid "\l" escape sequences in the TeX labels
# (a DeprecationWarning today, a SyntaxError in future Python)
plt.plot(list(range(1, 31)), mses, label=r"$\langle MSE\rangle$", color="black")  # <MSE> vs k curve
plt.plot(optimal_k, min(mses), '*', label="$k^*$", color="red", markersize=10)  # mark the optimal k
plt.xlabel("$k$")
plt.ylabel(r"$\langle MSE\rangle $")
plt.legend()
plt.title(r"$\langle MSE\rangle$ vs $k$")
plt.show()
predicted_ys = kNN_predict(X_train, y_train, X_test, optimal_k)  # test-set predictions with k*
oos_mse = np.mean(np.square(predicted_ys - y_test))  # out-of-sample MSE
print(f"The out-of-sample MSE for kNN regression is {oos_mse}.")
The out-of-sample MSE for kNN regression is 2.989671223395431.
We can see that the out-of-sample $MSE$ for the optimal value of $k$ found through $5$-fold cross-validation, $k^*$, is significantly higher than the out-of-sample $MSE$ of the linear regression model & its variants.
This supports the claim that a linear model is an adequate descriptor of the relationship between predictors and outcomes, as linear models perform better than kNN, which is local, nonparametric and doesn't take in account linear relationships between predictors and outcomes.
2.1.1
We are tasked to train a random forest classifier on a diabetes dataset using cross-entropy as the information criterion for the decision tree splits. We give a brief reminder on how random forests work:
Essentially, random forests are collections of decision trees that each predict an outcome, and in the end these outcomes are aggregated using a majority rule (if the majority of the trees classify $\mathbf{x}$ as $1$, the random forest will classify $\mathbf{x}$ as $1$). Therefore, we turn our attention to decision trees.
Decision trees split the sample space according to an information measure, such as the Gini index, or in our case, Cross-Entropy. Decision trees sequentially choose the best threshold $s$ of a given feature $d$ that maximises the information gained by splitting the sample space, partitioning it into $\{\mathbf{x}: x_d \geq s\}$ and $\{\mathbf{x}: x_d < s\}$, until a desired precision is achieved.
To start our classification task, we first start by loading the data:
classification_train = pd.read_csv("data/diabetes_samples.csv") # initial read of the training data (decimal commas not yet handled; re-read further below)
classification_test = pd.read_csv("data/diabetes_test.csv") # initial read of the test data
We examine both the train and test sets to see if we have to clean the data:
classification_train.head() # display the first 5 rows of the training set
| patient_number | cholesterol | glucose | hdl_chol | chol_hdl_ratio | age | height | weight | bmi | systolic_bp | diastolic_bp | waist | hip | waist_hip_ratio | diabetes | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 115 | 224 | 85 | 30 | 7,5 | 36 | 69 | 205 | 30,3 | 150 | 99 | 37 | 41 | 0,9 | No diabetes |
| 1 | 318 | 194 | 95 | 36 | 5,4 | 63 | 58 | 210 | 43,9 | 140 | 100 | 44 | 53 | 0,83 | No diabetes |
| 2 | 73 | 207 | 75 | 44 | 4,7 | 30 | 72 | 180 | 24,4 | 118 | 62 | 35 | 41 | 0,85 | No diabetes |
| 3 | 69 | 144 | 81 | 28 | 5,1 | 30 | 72 | 165 | 22,4 | 118 | 78 | 31 | 38 | 0,82 | No diabetes |
| 4 | 326 | 181 | 177 | 24 | 7,5 | 64 | 71 | 225 | 31,4 | 130 | 66 | 44 | 47 | 0,94 | Diabetes |
We test the type of the decimal entries, for example, bmi, to see if it's formatted as a float or a str:
type(classification_test.head()["bmi"][0]) # check the type of the bmi column
str
We observe the data must be cleaned by doing two things:
The decimals in this dataset are separated by a ",", instead of a ".". This means that these values, which should be interpreted as floats are read as str, meaning that we cannot perform arithmetic operations with them. Therefore, they must be changed to floats in order to be usable.
Our predicted variable has entries $\{\text{Diabetes}, \text{No diabetes}\}$. These must be mapped into 1 and 0 for classification.
Thankfully, using pd.read's decimal keyword fixes 1., and using the .map method which is built into pandas DataFrames, we can map $\text{Diabetes}\to 1$, $\text{No diabetes}\to 0$, respectively.
Moreover, note that one of the variables we may use as a predictor in our dataset is patient number. However, it is obviously clear that a patient's number has nothing to do with the likelihood of them having diabetes or not, and for that reason we drop this column using pandas' .drop method. All of this is done below:
# read the training data with the correct decimal separator
classification_train = pd.read_csv("data/diabetes_samples.csv", decimal=",") # "," decimals now parsed as floats
classification_test = pd.read_csv("data/diabetes_test.csv", decimal=",") # same fix for the test set
# convert the diabetes column to a binary variable and drop the patient_number column
diabetes = {"No diabetes": 0, "Diabetes": 1} # label-encoding map for the outcome variable
class_X_train = classification_train.drop(["diabetes", "patient_number"], axis=1) # predictors only; patient_number is an identifier, not a feature
class_y_train = classification_train["diabetes"].map(diabetes) # binary training labels
class_X_test = classification_test.drop(["diabetes", "patient_number"], axis=1) # test predictors
class_y_test = classification_test["diabetes"].map(diabetes) # binary test labels
We perform some exploratory data analysis by displaying some statistics of our training data:
classification_train.describe() # get some descriptive statistics
| patient_number | cholesterol | glucose | hdl_chol | chol_hdl_ratio | age | height | weight | bmi | systolic_bp | diastolic_bp | waist | hip | waist_hip_ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 273.000000 | 273.000000 | 273.000000 | 273.000000 | 273.000000 | 273.000000 | 273.000000 | 273.000000 | 273.000000 | 273.000000 | 273.000000 | 273.000000 | 273.000000 | 273.000000 |
| mean | 199.695971 | 208.868132 | 108.864469 | 50.531136 | 4.493407 | 47.622711 | 66.032967 | 177.424908 | 28.690842 | 136.772894 | 82.553114 | 37.941392 | 43.102564 | 0.881062 |
| std | 115.880545 | 45.051737 | 55.762363 | 17.041255 | 1.573433 | 17.110598 | 3.867618 | 40.460071 | 6.560658 | 22.621073 | 13.161257 | 5.744903 | 5.834459 | 0.072060 |
| min | 1.000000 | 78.000000 | 52.000000 | 12.000000 | 2.000000 | 19.000000 | 55.000000 | 99.000000 | 17.200000 | 98.000000 | 50.000000 | 26.000000 | 32.000000 | 0.700000 |
| 25% | 96.000000 | 179.000000 | 81.000000 | 39.000000 | 3.200000 | 34.000000 | 63.000000 | 151.000000 | 23.800000 | 121.000000 | 74.000000 | 33.000000 | 39.000000 | 0.830000 |
| 50% | 198.000000 | 204.000000 | 90.000000 | 46.000000 | 4.200000 | 45.000000 | 66.000000 | 174.000000 | 27.900000 | 136.000000 | 82.000000 | 37.000000 | 42.000000 | 0.880000 |
| 75% | 306.000000 | 232.000000 | 108.000000 | 59.000000 | 5.400000 | 61.000000 | 69.000000 | 200.000000 | 32.200000 | 148.000000 | 90.000000 | 42.000000 | 46.000000 | 0.930000 |
| max | 390.000000 | 404.000000 | 385.000000 | 120.000000 | 12.200000 | 92.000000 | 76.000000 | 308.000000 | 50.500000 | 250.000000 | 118.000000 | 53.000000 | 64.000000 | 1.090000 |
We now visualise our data using a pairplot:
sns.pairplot(classification_train, hue="diabetes", corner=True); # lower-triangle pairplot coloured by diabetes status; ";" suppresses the notebook's repr output
plt.suptitle("Pairplot of the diabetes dataset", y=1.05, fontsize=16) # place the title above the seaborn grid
plt.show()